# === Import data & preprocessing ===
customer_raw <- fread("./data/marketing_campaign.csv")
alone_status <- c("Widow", "Single", "Divorced", "Alone", "Absurd")
customer <- customer_raw %>% select(-c(8, 27:28)) %>%
# for missing
mutate(Income = ifelse(is.na(Income), round(mean(Income, na.rm = TRUE)), Income),
Age = 2022 - Year_Birth,
AcceptedComp = ifelse(rowSums(across(starts_with("AcceptedCmp"))) + Response > 0, 1, 0),
Education = recode(Education, Basic = "Below some college", `2n Cycle` = "Associate degree",
Graduation = "Bachelor's degree", Master = "Master's degree", PhD = "Doctoral degree")
) %>%
filter(Marital_Status != "YOLO") %>%
select(1, 27, 3:8, 25, 9:14, 15, 28, 16:19) %>%
mutate_at(c(3:4, 9, 17), .funs = ~as.factor(.))
# Import original data & sample 0.5%
# bf_dat <- fread("./data/data-final.csv")
# chose_ind <- sample(c(1:nrow(bf_dat)), size = round(nrow(bf_dat) * 0.002), replace = FALSE)
# bf <- bf_dat[chose_ind, ]
# write_csv(bf, "./data/big_five.csv")
big_five_raw <- fread("./data/big_five.csv")
country_df <- countrycode::codelist %>%
select(3, 6, 17)
colnames(country_df)[3] <- "country"
big_five_raw <- left_join(big_five_raw, country_df, by = "country") %>%
select(-c(51:104, 106, 108))
big_five_raw <- big_five_raw %>%
mutate(num_zero = rowSums(big_five_raw[, 1:50] == 0)) %>%
filter(num_zero == 0) %>% select(-57)
big_five <- big_five_raw %>%
filter(IPC == 1) %>%
mutate_at(c(1:50, 55:56), .funs = ~as.factor(.)) %>%
mutate(
testlapse = as.numeric(testelapse),
lat = as.numeric(lat_appx_lots_of_err),
long = as.numeric(long_appx_lots_of_err),
country = country.name.en
) %>%
select(-c(51:54, 56)) %>%
na.omit()
big_five <- big_five %>% mutate(id = seq(1:nrow(big_five))) %>%
relocate(id, .before = EXT1)
# --- customer ---
# Check correlation
check_cor <- model.matrix(~., customer)[, -1]
corrplot(cor(check_cor[, -1]), method = "circle", type = "full",
tl.cex = 0.75, tl.col = "black")
# Some demogrpahics
summary(customer)
## ID Age Education Marital_Status
## Min. : 0 Min. : 26.0 Associate degree : 203 Absurd : 2
## 1st Qu.: 2830 1st Qu.: 45.0 Bachelor's degree :1127 Alone : 3
## Median : 5458 Median : 52.0 Below some college: 54 Divorced:232
## Mean : 5592 Mean : 53.2 Doctoral degree : 484 Married :864
## 3rd Qu.: 8425 3rd Qu.: 63.0 Master's degree : 370 Single :480
## Max. :11191 Max. :129.0 Together:580
## Widow : 77
## Income Kidhome Teenhome Recency Complain
## Min. : 1730 Min. :0.0000 Min. :0.0000 Min. : 0.00 0:2217
## 1st Qu.: 35528 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:24.00 1: 21
## Median : 51790 Median :0.0000 Median :0.0000 Median :49.00
## Mean : 52251 Mean :0.4446 Mean :0.5058 Mean :49.15
## 3rd Qu.: 68307 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:74.00
## Max. :666666 Max. :2.0000 Max. :2.0000 Max. :99.00
##
## MntWines MntFruits MntMeatProducts MntFishProducts
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.00
## 1st Qu.: 23.25 1st Qu.: 1.00 1st Qu.: 16.0 1st Qu.: 3.00
## Median : 173.00 Median : 8.00 Median : 67.0 Median : 12.00
## Mean : 303.92 Mean : 26.32 Mean : 167.1 Mean : 37.56
## 3rd Qu.: 504.75 3rd Qu.: 33.00 3rd Qu.: 232.0 3rd Qu.: 50.00
## Max. :1493.00 Max. :199.00 Max. :1725.0 Max. :259.00
##
## MntSweetProducts MntGoldProds NumDealsPurchases AcceptedComp
## Min. : 0.00 Min. : 0.00 Min. : 0.000 0:1630
## 1st Qu.: 1.00 1st Qu.: 9.00 1st Qu.: 1.000 1: 608
## Median : 8.00 Median : 24.00 Median : 2.000
## Mean : 27.08 Mean : 44.02 Mean : 2.323
## 3rd Qu.: 33.00 3rd Qu.: 56.00 3rd Qu.: 3.000
## Max. :263.00 Max. :362.00 Max. :15.000
##
## NumWebPurchases NumCatalogPurchases NumStorePurchases NumWebVisitsMonth
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 3.00 1st Qu.: 3.000
## Median : 4.000 Median : 2.000 Median : 5.00 Median : 6.000
## Mean : 4.082 Mean : 2.664 Mean : 5.79 Mean : 5.314
## 3rd Qu.: 6.000 3rd Qu.: 4.000 3rd Qu.: 8.00 3rd Qu.: 7.000
## Max. :27.000 Max. :28.000 Max. :13.00 Max. :20.000
##
customer %>%
group_by(Education, Marital_Status) %>%
summarise(tot = n()) %>%
ggplot(aes(x = tot, y = Education, fill = Marital_Status)) +
geom_bar(stat = "identity") + theme_bw() +
scale_fill_brewer(palette = "RdYlGn") +
labs(
x = "Number of participants",
y = "Education",
title = "Frequency by Marital Status & Education"
)
# Total demographics
tbl_summary(dat = customer %>%
mutate(Complain = factor(Complain, levels = c(0, 1), labels = c("No", "Yes")),
AcceptedComp = factor(AcceptedComp, levels = c(0, 1), labels = c("No", "Yes"))) %>%
select(-1),
label = list(Marital_Status ~ "Marital Status", Kidhome ~ "Number of Kids",
Teenhome ~ "Number of Teens", Recency ~ "Number of Days Since Last Purchase",
Complain ~ "Complaint (in last 2 yrs)",
# products
MntWines ~ "Wine", MntFruits ~ "Fruits", MntMeatProducts ~ "Meat Products",
MntFishProducts ~ "Fish Products", MntSweetProducts ~ "Sweet Products",
MntGoldProds ~ "Gold Products",
# campaign & place
NumDealsPurchases ~ "Number of Purchases made with a discount",
AcceptedComp ~ "Customer Accepted the Offer",
NumWebPurchases ~ "Through the Company's Website", NumCatalogPurchases ~ "Using a Catelogue",
NumStorePurchases ~ "Directlt in Stores", NumWebVisitsMonth ~ "Website Visits (in last month)")) %>%
modify_table_styling(footnote = "Amount spent on a type of product in last two years",
rows = (label == "Wine"), columns = label) %>%
modify_table_styling(footnote = "Number of purchases made through a way",
rows = (label == "Through the Company's Website"),
columns = label) %>%
modify_caption("**Demographics of Customer (n=2238)**") %>%
bold_labels()
| Characteristic | N = 2,2381 |
|---|---|
| Age | 52 (45, 63) |
| Education | |
| Â Â Â Â Associate degree | 203 (9.1%) |
| Â Â Â Â Bachelor's degree | 1,127 (50%) |
| Â Â Â Â Below some college | 54 (2.4%) |
| Â Â Â Â Doctoral degree | 484 (22%) |
| Â Â Â Â Master's degree | 370 (17%) |
| Marital Status | |
| Â Â Â Â Absurd | 2 (<0.1%) |
| Â Â Â Â Alone | 3 (0.1%) |
| Â Â Â Â Divorced | 232 (10%) |
| Â Â Â Â Married | 864 (39%) |
| Â Â Â Â Single | 480 (21%) |
| Â Â Â Â Together | 580 (26%) |
| Â Â Â Â Widow | 77 (3.4%) |
| Income | 51,790 (35,528, 68,307) |
| Number of Kids | |
| Â Â Â Â 0 | 1,291 (58%) |
| Â Â Â Â 1 | 899 (40%) |
| Â Â Â Â 2 | 48 (2.1%) |
| Number of Teens | |
| Â Â Â Â 0 | 1,158 (52%) |
| Â Â Â Â 1 | 1,028 (46%) |
| Â Â Â Â 2 | 52 (2.3%) |
| Number of Days Since Last Purchase | 49 (24, 74) |
| Complaint (in last 2 yrs) | 21 (0.9%) |
| Wine2 | 173 (23, 505) |
| Fruits | 8 (1, 33) |
| Meat Products | 67 (16, 232) |
| Fish Products | 12 (3, 50) |
| Sweet Products | 8 (1, 33) |
| Gold Products | 24 (9, 56) |
| Number of Purchases made with a discount | 2 (1, 3) |
| Customer Accepted the Offer | 608 (27%) |
| Through the Company's Website3 | 4 (2, 6) |
| Using a Catelogue | 2 (0, 4) |
| Directlt in Stores | 5 (3, 8) |
| Website Visits (in last month) | 6 (3, 7) |
| 1 Median (IQR); n (%) | |
| 2 Amount spent on a type of product in last two years | |
| 3 Number of purchases made through a way | |
# --- Big Five ---
# Geographic map
mapworld <- borders("world", colour = "gray50", fill = "white")
big_five %>% ggplot() +
geom_point(aes(x = long, y = lat, color = country), size = 1.5) + mapworld +
theme_bw() + theme(legend.position = "none") +
labs(x = "Longitude", y = "Latitude", title = "Geographic Distribution of Participants (n=1243)")
# Frequency by country
bf_eda <- big_five %>%
group_by(country) %>% summarise(tot = n())
bf_eda[order(bf_eda$tot, decreasing = TRUE), ] %>%
top_n(50) %>%
ggplot(aes(x = tot, y = reorder(country, tot))) +
geom_bar(stat = "identity") + theme_bw() +
theme(legend.position = "none", axis.text.y = element_text(size = 7)) +
labs(x = "Number of Participants", y = "Country", title = "Number of Participants by Country")
set.seed(202212)
customer_con <- customer[, c(5:8, 10:16, 18:21)]
customer_scale <- scale(customer_con)
fviz_nbclust(customer_scale, FUNcluster = kmeans,
method = "gap_stat", iter.max = 50)
km <- kmeans(customer_scale, centers = 3, nstart = 20)
fviz_cluster(list(data = customer_scale, cluster = km$cluster),
ellipse.type = "convex", geom = "point",
labelsize = 5, palette = "Dark2",
main = "K-means Cluster Plot for Customer Data") + theme_bw()
# Gower distance
res_gower <- daisy(customer, metric = "gower")
summary(res_gower)
## 2503203 dissimilarities, summarized :
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000128 0.1861700 0.2356000 0.2361100 0.2853200 0.5671500
## Metric : mixed ; Types = I, I, N, N, I, I, I, I, N, I, I, I, I, I, I, I, N, I, I, I, I
## Number of objects : 2238
pam_clust <- pam(as.matrix(res_gower), diss = TRUE, k = 3)
# Find the number of clusters - 3
res_silhouette <- lapply(2:10, function(x) {
pam_clust <- pam(as.matrix(res_gower), diss = TRUE, k = x)
silhouette <- pam_clust$silinfo$avg.width
})
do.call(rbind, res_silhouette) %>%
as.data.frame() %>% mutate(cluster = c(2:10)) %>%
ggplot(aes(x = cluster, y = V1)) +
geom_line() + theme_bw() +
labs(x = "Number of clusters k", y = "Sihouette Width",
title = "Optimal number of clusters")
km_mod <- pam(as.matrix(res_gower), diss = TRUE, k = 3)
# Clustering Comparison
fossil::rand.index(km$cluster, km_mod$clustering)
## [1] 0.7355628
# --- Demographic for each cluster ---
tbl_summary(by = clust,
data = customer %>%
mutate(clust = km_mod$clustering,
Complain = factor(Complain, levels = c(0, 1), labels = c("No", "Yes")),
AcceptedComp = factor(AcceptedComp, levels = c(0, 1), labels = c("No", "Yes")),
clust = factor(clust, levels = c(1:3), labels = c("Cluster 1", "Cluster 2", "Cluster 3"))) %>%
select(-1),
label = list(Marital_Status ~ "Marital Status", Kidhome ~ "Number of Kids",
Teenhome ~ "Number of Teens", Recency ~ "Number of Days Since Last Purchase",
Complain ~ "Complaint (in last 2 yrs)",
# products
MntWines ~ "Wine", MntFruits ~ "Fruits", MntMeatProducts ~ "Meat Products",
MntFishProducts ~ "Fish Products", MntSweetProducts ~ "Sweet Products",
MntGoldProds ~ "Gold Products",
# campaign & place
NumDealsPurchases ~ "Number of Purchases made with a discount",
AcceptedComp ~ "Customer Accepted the Offer",
NumWebPurchases ~ "Through the Company's Website", NumCatalogPurchases ~ "Using a Catelogue",
NumStorePurchases ~ "Directlt in Stores", NumWebVisitsMonth ~ "Website Visits (in last month)")) %>%
add_p(list(all_categorical() ~ "chisq.test", all_continuous() ~ "aov")) %>% add_overall() %>%
modify_table_styling(footnote = "Amount spent on a type of product in last two years",
rows = (label == "Wine"), columns = label) %>%
modify_table_styling(footnote = "Number of purchases made through a way",
rows = (label == "Through the Company's Website"),
columns = label) %>%
modify_caption("**Demographics of Customers for each K-medoids Cluster (n=2238)**") %>%
bold_labels()
| Characteristic | Overall, N = 2,2381 | Cluster 1, N = 6651 | Cluster 2, N = 9071 | Cluster 3, N = 6661 | p-value2 |
|---|---|---|---|---|---|
| Age | 52 (45, 63) | 57 (46, 66) | 48 (41, 54) | 57 (49, 66) | <0.001 |
| Education | <0.001 | ||||
| Â Â Â Â Associate degree | 203 (9.1%) | 60 (9.0%) | 97 (11%) | 46 (6.9%) | |
| Â Â Â Â Bachelor's degree | 1,127 (50%) | 454 (68%) | 546 (60%) | 127 (19%) | |
| Â Â Â Â Below some college | 54 (2.4%) | 3 (0.5%) | 49 (5.4%) | 2 (0.3%) | |
| Â Â Â Â Doctoral degree | 484 (22%) | 46 (6.9%) | 71 (7.8%) | 367 (55%) | |
| Â Â Â Â Master's degree | 370 (17%) | 102 (15%) | 144 (16%) | 124 (19%) | |
| Marital Status | <0.001 | ||||
| Â Â Â Â Absurd | 2 (<0.1%) | 2 (0.3%) | 0 (0%) | 0 (0%) | |
| Â Â Â Â Alone | 3 (0.1%) | 0 (0%) | 1 (0.1%) | 2 (0.3%) | |
| Â Â Â Â Divorced | 232 (10%) | 69 (10%) | 91 (10%) | 72 (11%) | |
| Â Â Â Â Married | 864 (39%) | 110 (17%) | 386 (43%) | 368 (55%) | |
| Â Â Â Â Single | 480 (21%) | 155 (23%) | 214 (24%) | 111 (17%) | |
| Â Â Â Â Together | 580 (26%) | 299 (45%) | 195 (21%) | 86 (13%) | |
| Â Â Â Â Widow | 77 (3.4%) | 30 (4.5%) | 20 (2.2%) | 27 (4.1%) | |
| Income | 51,790 (35,528, 68,307) | 70,713 (59,925, 79,803) | 34,176 (25,238, 42,394) | 58,100 (48,918, 67,442) | <0.001 |
| Number of Kids | <0.001 | ||||
| Â Â Â Â 0 | 1,291 (58%) | 621 (93%) | 142 (16%) | 528 (79%) | |
| Â Â Â Â 1 | 899 (40%) | 44 (6.6%) | 729 (80%) | 126 (19%) | |
| Â Â Â Â 2 | 48 (2.1%) | 0 (0%) | 36 (4.0%) | 12 (1.8%) | |
| Number of Teens | <0.001 | ||||
| Â Â Â Â 0 | 1,158 (52%) | 442 (66%) | 590 (65%) | 126 (19%) | |
| Â Â Â Â 1 | 1,028 (46%) | 216 (32%) | 305 (34%) | 507 (76%) | |
| Â Â Â Â 2 | 52 (2.3%) | 7 (1.1%) | 12 (1.3%) | 33 (5.0%) | |
| Number of Days Since Last Purchase | 49 (24, 74) | 46 (21, 72) | 50 (26, 77) | 52 (25, 73) | 0.044 |
| Complaint (in last 2 yrs) | 21 (0.9%) | 7 (1.1%) | 12 (1.3%) | 2 (0.3%) | 0.11 |
| Wine3 | 173 (23, 505) | 458 (265, 722) | 19 (7, 56) | 356 (134, 638) | <0.001 |
| Fruits | 8 (1, 33) | 36 (19, 80) | 3 (1, 7) | 9 (1, 32) | <0.001 |
| Meat Products | 67 (16, 232) | 319 (153, 534) | 15 (8, 34) | 92 (33, 184) | <0.001 |
| Fish Products | 12 (3, 50) | 63 (28, 125) | 4 (2, 12) | 12 (2, 40) | <0.001 |
| Sweet Products | 8 (1, 33) | 38 (17, 85) | 3 (1, 8) | 8 (0, 34) | <0.001 |
| Gold Products | 24 (9, 56) | 56 (31, 111) | 11 (5, 24) | 26 (10, 58) | <0.001 |
| Number of Purchases made with a discount | 2 (1, 3) | 1 (1, 2) | 2 (1, 3) | 2 (1, 4) | <0.001 |
| Customer Accepted the Offer | 608 (27%) | 271 (41%) | 153 (17%) | 184 (28%) | <0.001 |
| Through the Company's Website4 | 4 (2, 6) | 5 (3, 7) | 2 (1, 3) | 5 (3, 7) | <0.001 |
| Using a Catelogue | 2 (0, 4) | 5 (3, 7) | 0 (0, 1) | 2 (1, 4) | <0.001 |
| Directlt in Stores | 5 (3, 8) | 8 (6, 10) | 3 (3, 4) | 6 (4, 9) | <0.001 |
| Website Visits (in last month) | 6 (3, 7) | 3 (2, 5) | 7 (6, 8) | 5 (4, 7) | <0.001 |
| 1 Median (IQR); n (%) | |||||
| 2 One-way ANOVA; Pearson's Chi-squared test | |||||
| 3 Amount spent on a type of product in last two years | |||||
| 4 Number of purchases made through a way | |||||
# --- Heatmap ---
# Products
htp_1 <- customer %>% select(10:15) %>%
mutate(cluster = km_mod$clustering) %>%
pivot_longer(
1:6, names_to = "product", values_to = "value"
) %>%
group_by(cluster, product) %>%
summarise(amount = mean(value)) %>%
ggplot(aes(x = cluster, y = product, fill = amount)) +
geom_tile() + theme_bw() +
theme(legend.position = "bottom") +
scale_y_discrete(labels = c("Fish", "Fruit", "Gold", "Meat", "Sweet", "Wine")) +
scale_fill_distiller(palette = "OrRd") +
labs(x = "Cluster", y = "Product")
# Places
htp_2 <- customer %>% select(18:20) %>%
mutate(cluster = km_mod$clustering) %>%
pivot_longer(
1:3, names_to = "place", values_to = "value"
) %>%
group_by(cluster, place) %>%
summarise(number = mean(value)) %>%
ggplot(aes(x = cluster, y = place, fill = number)) +
geom_tile() + theme_bw() +
theme(legend.position = "bottom") +
scale_y_discrete(labels = c("Catalog", "Store", "Website")) +
scale_fill_distiller(palette = "OrRd") +
labs(x = "Cluster", y = "Place")
htp_1 + htp_2 +
plot_annotation(title = "Heatmap of Amount Spend and Number of Visits by K-medoids Cluster")
# --- Networks for continuous vars ---
lapply(1:length(unique(km_mod$clustering)), function(x) {
print(paste("Cluster", x, "with", sum(km$cluster == x), "subjects"))
cl <- customer_con[km_mod$clustering == x,]
net <- estimateNetwork(as.matrix(cl), default = "EBICglasso")
qgraph(net$graph, labels = names(cl), layout = "spring",
theme = "TeamFortress", label.cex = 1.5,
label.fill.horizontal = .7)
})
## [1] "Cluster 1 with 1045 subjects"
## [1] "Cluster 2 with 584 subjects"
## [1] "Cluster 3 with 609 subjects"
## [[1]]
## From To Weight
## 2 --- 3 -0.03
## 1 --- 5 0.28
## 1 --- 6 0.05
## 3 --- 6 -0.04
## 1 --- 7 0.29
## 3 --- 7 -0.23
## 5 --- 7 0.05
## 6 --- 7 0.05
## 1 --- 8 0.04
## 3 --- 8 -0.06
## 6 --- 8 0.19
## 7 --- 8 0.06
## 1 --- 9 0.12
## 4 --- 9 0.05
## 6 --- 9 0.18
## 7 --- 9 0.02
## 8 --- 9 0.18
## 4 --- 10 0.01
## 6 --- 10 0.09
## 8 --- 10 0.09
## 1 --- 11 -0.02
## 2 --- 11 0.4
## 3 --- 11 0.38
## 1 --- 12 0.12
## 3 --- 12 0.01
## 4 --- 12 0
## 5 --- 12 0.14
## 8 --- 12 0.01
## 9 --- 12 0.07
## 10 --- 12 0.03
## 11 --- 12 0.02
## 1 --- 13 0.29
## 3 --- 13 -0.02
## 5 --- 13 0.07
## 7 --- 13 0.13
## 8 --- 13 0.04
## 9 --- 13 0.04
## 10 --- 13 0.07
## 1 --- 14 0.12
## 5 --- 14 0.13
## 10 --- 14 0.02
## 12 --- 14 0.08
## 1 --- 15 -0.38
## 2 --- 15 0.04
## 3 --- 15 0.06
## 4 --- 15 -0.02
## 5 --- 15 0.06
## 6 --- 15 -0.03
## 7 --- 15 -0.03
## 8 --- 15 -0.04
## 9 --- 15 -0.01
## 11 --- 15 0.22
## 12 --- 15 0.44
## 13 --- 15 -0.05
## 14 --- 15 0
##
## [[2]]
## From To Weight
## 1 --- 2 0.03
## 1 --- 3 0.1
## 2 --- 3 0
## 3 --- 4 0.03
## 1 --- 5 0.15
## 3 --- 5 0.06
## 1 --- 6 0.01
## 3 --- 6 -0.07
## 4 --- 6 0.01
## 5 --- 6 0.03
## 3 --- 7 -0.05
## 5 --- 7 0.07
## 6 --- 7 0.07
## 2 --- 8 -0.04
## 3 --- 8 -0.07
## 5 --- 8 0.06
## 6 --- 8 0.29
## 7 --- 8 0.04
## 1 --- 9 0.01
## 3 --- 9 -0.04
## 5 --- 9 -0.12
## 6 --- 9 0.19
## 7 --- 9 0.07
## 8 --- 9 0.19
## 2 --- 10 -0.07
## 3 --- 10 -0.02
## 5 --- 10 0.09
## 8 --- 10 0.06
## 9 --- 10 0.1
## 2 --- 11 0.14
## 3 --- 11 0.25
## 5 --- 11 -0.05
## 6 --- 11 -0.04
## 7 --- 11 0.09
## 8 --- 11 -0.02
## 9 --- 11 -0.12
## 1 --- 12 0.03
## 5 --- 12 0.09
## 9 --- 12 0.3
## 10 --- 12 0.48
## 11 --- 12 0.22
## 2 --- 13 -0.01
## 5 --- 13 0.13
## 6 --- 13 0.02
## 7 --- 13 0.71
## 8 --- 13 0.03
## 10 --- 13 0.05
## 11 --- 13 0.16
## 12 --- 13 -0.03
## 1 --- 14 0.04
## 5 --- 14 0.47
## 6 --- 14 0.2
## 7 --- 14 0.08
## 8 --- 14 0.12
## 9 --- 14 0.1
## 10 --- 14 -0.11
## 11 --- 14 0.19
## 12 --- 14 0.12
## 13 --- 14 -0.11
## 1 --- 15 -0.15
## 3 --- 15 -0.13
## 5 --- 15 0.1
## 6 --- 15 0
## 9 --- 15 -0.03
## 11 --- 15 0.27
## 12 --- 15 0.03
## 13 --- 15 -0.12
## 14 --- 15 -0.19
##
## [[3]]
## From To Weight
## 2 --- 3 -0.02
## 1 --- 5 0.22
## 2 --- 5 -0.07
## 3 --- 5 -0.09
## 3 --- 6 -0.03
## 1 --- 7 0.27
## 3 --- 7 -0.2
## 4 --- 7 0.04
## 5 --- 7 0.01
## 6 --- 7 0.14
## 3 --- 8 -0.07
## 5 --- 8 -0.01
## 6 --- 8 0.28
## 7 --- 8 0.06
## 1 --- 9 0.05
## 2 --- 9 -0.01
## 3 --- 9 -0.03
## 4 --- 9 -0.01
## 5 --- 9 -0.03
## 6 --- 9 0.27
## 7 --- 9 0.03
## 8 --- 9 0.27
## 2 --- 10 -0.04
## 3 --- 10 0.08
## 4 --- 10 0.01
## 5 --- 10 0.08
## 6 --- 10 0.05
## 8 --- 10 0.11
## 9 --- 10 0.03
## 2 --- 11 0.28
## 3 --- 11 0.19
## 6 --- 11 -0.05
## 7 --- 11 0.04
## 8 --- 11 -0.04
## 10 --- 11 0.08
## 1 --- 12 0.11
## 2 --- 12 -0.08
## 3 --- 12 0.04
## 4 --- 12 0.02
## 5 --- 12 0.23
## 9 --- 12 0.07
## 10 --- 12 0.16
## 11 --- 12 0.04
## 1 --- 13 0.13
## 2 --- 13 -0.12
## 4 --- 13 0.02
## 5 --- 13 0.23
## 7 --- 13 0.38
## 8 --- 13 0.08
## 10 --- 13 0.05
## 11 --- 13 0.16
## 2 --- 14 -0.15
## 5 --- 14 0.24
## 6 --- 14 0.13
## 8 --- 14 0.08
## 9 --- 14 0.08
## 11 --- 14 0.11
## 12 --- 14 0.12
## 1 --- 15 -0.37
## 2 --- 15 0.07
## 3 --- 15 0.05
## 5 --- 15 0.17
## 7 --- 15 -0.04
## 8 --- 15 -0.03
## 9 --- 15 -0.08
## 11 --- 15 0.26
## 12 --- 15 0.21
## 13 --- 15 -0.06
## 14 --- 15 -0.08
# Try #class from 2 to 8
lca_dat <- big_five[, 2:51] %>% mutate_all(.funs = ~as.numeric(.))
f <- formula(paste("cbind(", paste(colnames(lca_dat), collapse = ","), ") ~ 1"))
lca_res <- mclapply(2:8, function(x) {
set.seed(202212)
lca <- poLCA(f, lca_dat, nclass = x, verbose = FALSE, nrep = 20)
return(lca)
}, mc.cores = num.cores)
df_bic <- data.frame(Class = 2:8, BIC = unlist(lapply(lca_res, function(obj) obj$bic)))
df_bic %>%
ggplot(aes(x = Class, y = BIC, label = round(BIC))) +
geom_line() + geom_point() +
geom_text(vjust = -.35) + theme_bw() +
labs(title = "BIC for Latent Class Analysis for Big Five Data")
# Choose 5 classes with smallest BIC
n_class <- df_bic$Class[which.min(df_bic$BIC)]
fit_lca <- lca_res[[n_class - 1]]
# --- Alluvial plots ---
# rename
score <- c("Excellent", "Very Good", "Good", "Fair", "Poor")
col_names <- c("PartyLife", "TalkLess", "SocialComfort", "Invisible", "TopicStarter",
"SayLittle", "ExtensiveContact", "AvoidAttention", "CentralPerson", "Quiet",
"Stressed", "Relaxed", "Worried", "FeelBlueLess", "Distubed",
"GetUpset", "MoodChange", "FreqMoodSwing", "Irritated", "FeelBlue",
"ConcernLess", "Interested", "Insult", "Sympathize", "NotInterestProb",
"SoftHeart", "NotInterestPpl", "TakeTime", "FeelEmo", "MakeEase",
"Prepared", "LeaveBelonging", "Meticulous", "MakeMess", "NoDelay",
"Forgetful", "LikeOrder", "ShirkDuty", "FollowSche", "Exacting",
"RichVocal", "DiffAbstract", "ViviImag", "NoInAbstract", "ExcelIdea",
"NoImag", "QuickUnderstand", "DiffWord", "SlowReflect", "FullIdea")
five_factor <- data.frame(
fact_name = c("Extraversion", "EmoStablility", "Agreeableness", "Conscientious", "Intellect"),
factr = c("EXT", "EST", "AGR", "CSN", "OPN"))
# set pos or neg answers
dirct <- data.frame(variable = c(paste("EXT", seq(10), sep = ""),
paste("EST", seq(10), sep = ""),
paste("AGR", seq(10), sep = ""),
paste("CSN", seq(10), sep = ""),
paste("OPN", seq(10), sep = "")),
dirct = c(rep(c(1, 0), 5),
0, 1, 0, 1, rep(0, 6),
0, 1, 0, 1, 0, 1, 0, 1, 1, 1,
1, 0, 1, 0, 1, 0, 1, 0, 1, 1,
1, 0, 1, 0, 1, 0, 1, 1, 1, 1))
# combine lca clusters
dat_clust <- lca_dat %>%
mutate(clust = fit_lca$predclass)
dat_forshow <- dat_clust %>%
pivot_longer(1:50, names_to = "variable", values_to = "value") %>%
group_by(variable, clust, value) %>%
summarise(tot = n()) %>%
pivot_wider(c(1:2), names_from = "value", values_from = "tot")
dat_forshow[is.na(dat_forshow)] <- 0
# change frequency to percentage
# convert answer to score: subtract 6 by answers of negative variables
tot_dat <- dat_forshow %>%
pivot_longer(3:7, names_to = "answer", values_to = "number") %>%
group_by(variable, answer) %>% summarise(tot = sum(number)) %>%
mutate(comb = paste(variable, answer, sep = ""))
dat_forshow_1 <- dat_forshow %>%
pivot_longer(3:7, names_to = "answer", values_to = "number") %>%
mutate(comb = paste(variable, answer, sep = ""))
dat_forshow_1 <- left_join(dat_forshow_1, dirct, by = "variable")
dat_forshow_2 <- dat_forshow_1 %>%
mutate(
answer = as.numeric(answer),
score = ifelse(dirct == 0, 6 - answer, answer)) %>%
select(1, 2, 7, 4, 5)
# change label name
dat_forshow_3 <- left_join(dat_forshow_2,
tot_dat %>% select(3, 4), by = "comb") %>%
mutate(variable = variable.x, prct = number/tot * 100) %>%
select(8, 2:3, 9) %>% pivot_wider(names_from = "score", values_from = "prct")
# change factor name
dat_forshow_4 <- dat_forshow_3 %>%
mutate(factr = substr(variable, 1, 3))
dat_forshow_5 <-
left_join(dat_forshow_4,
data.frame(name = col_names, variable = c(paste("EXT", seq(10), sep = ""),
paste("EST", seq(10), sep = ""),
paste("AGR", seq(10), sep = ""),
paste("CSN", seq(10), sep = ""),
paste("OPN", seq(10), sep = ""))),
by = "variable")
dat_forshow_fin <- left_join(dat_forshow_5, five_factor, by = "factr")
make_alluvial <- function(x){
dat_forshow_fin[, -c(1, 8)] %>%
pivot_longer(c(2:6), names_to = "Score", values_to = "Prct") %>%
mutate(clust = as.factor(clust),
Score = factor(Score, levels = c(5:1), labels = score)) %>%
filter(str_detect(fact_name, x)) %>%
ggplot(aes(y = Prct, axis1 = name, axis2 = Score, axis3 = clust)) +
geom_alluvium(aes(fill = clust), width = 1/12) +
geom_stratum(width = 1/12, fill = "black", color = "grey") +
geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("Question", "Score", "Cluster"), expand = c(.09, .09)) +
scale_y_continuous(breaks = NULL) +
scale_fill_brewer(type = "qual", palette = "Dark2") +
guides(fill = guide_legend(title = "Cluster")) +
theme_bw() +
labs(y = "Percentage",
title = paste("Alluvial Plot of", x, "by Cluster"))
}
# plot alluvial
lapply(unique(dat_forshow_fin$fact_name), function(x){
make_alluvial(x)})
# --- cluster 4: excellent on all five ---
clust_4 <- big_five %>%
mutate(clust = fit_lca$predclass) %>%
filter(clust == 4)
# by continent
clust_4 %>%
group_by(continent) %>% summarise(tot = n()) %>%
mutate(prct = tot/sum(fit_lca$predclass == 4) * 100) %>%
ggplot(aes(x = prct, y = continent)) +
geom_bar(stat = "identity")
# --- Cluster 5: most worst group---
clust_5 <- big_five %>%
mutate(clust = fit_lca$predclass) %>%
filter(clust == 5)
clust_5 %>%
group_by(continent) %>% summarise(tot = n()) %>%
mutate(prct = tot/sum(fit_lca$predclass == 5) * 100) %>%
ggplot(aes(x = prct, y = continent)) +
geom_bar(stat = "identity")